library(GenomicRanges)
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
## tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
library(rtracklayer)
library(epiwraps)
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
## Loading required package: EnrichedHeatmap
## Loading required package: grid
## Loading required package: ComplexHeatmap
## ========================================
## ComplexHeatmap version 2.20.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
##
## If you use it in published research, please cite either one:
## - Gu, Z. Complex Heatmap Visualization. iMeta 2022.
## - Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
## genomic data. Bioinformatics 2016.
##
##
## The new InteractiveComplexHeatmap package can directly export static
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
## ========================================
## EnrichedHeatmap version 1.34.0
## Bioconductor page: http://bioconductor.org/packages/EnrichedHeatmap/
## Github page: https://github.com/jokergoo/EnrichedHeatmap
## Documentation: http://bioconductor.org/packages/EnrichedHeatmap/
##
## If you use it in published research, please cite:
## Gu, Z. EnrichedHeatmap: an R/Bioconductor package for comprehensive
## visualization of genomic signal associations. BMC Genomics 2018.
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(EnrichedHeatmap))
## ========================================
library(ggplot2)
library(rGREAT) # Gene Ontology enrichment among genomic regions
##
##
## ========================================
## rGREAT version 2.6.0
## Bioconductor page: http://bioconductor.org/packages/rGREAT/
## Github page: https://github.com/jokergoo/rGREAT
##
## If you use it in published research, please cite:
## Gu, Z. rGREAT: an R/Bioconductor package for functional enrichment
## on genomic regions. Bioinformatics 2023.
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(rGREAT))
##
## Note: On Aug 19 2019 GREAT released version 4 where it supports `hg38`
## genome and removes some ontologies such pathways. `submitGreatJob()`
## still takes `hg19` as default. `hg38` can be specified by the `genome
## = 'hg38'` argument. To use the older versions such as 3.0.0, specify as
## `submitGreatJob(..., version = '3.0.0')`.
##
## From rGREAT version 1.99.0, it also implements the GREAT algorithm and
## it allows to integrate GREAT analysis with the Bioconductor annotation
## ecosystem. By default it supports more than 500 organisms. Check the
## new function `great()` and the new vignette.
## ========================================
setwd("/Users/catherinerohner/ETH/Bioinformatics/Assignment10")
download.file("https://ethz-ins.org/content/w10.assignment.zip", "w10.assignment.zip")
unzip("w10.assignment.zip")
list.files()
## [1] "assignment.html" "assignment.Rmd" "Creb1.bed"
## [4] "Creb1.bw" "Creb3.bed" "Creb3.bw"
## [7] "Creb3L1.bed" "Creb3L1.bw" "w10.assignment.zip"
peaks <- list.files(pattern="bed$")
# we first import the peaks
peaks <- lapply(peaks, rtracklayer::import.bed)
# we'll focus on the high-quality peaks
peaks <- lapply(peaks, FUN=function(x) x[x$score>800])
# we get the union of non-redundant regions
regions <- reduce(unlist(GRangesList(peaks)))
#l load tracks
tracks <- list.files(pattern="bw$")
We utilize enriched heatmaps to examine the binding patterns of CREB1, CREB3, and CREB3L1 across the genomic regions. The initial plots reveal clear signals for all three transcription factors, suggesting strong binding activity. However, the complexity and overlap in data require further organization to better identify and understand the underlying patterns
ese <- signal2Matrix(tracks, regions, extend=2000)
## Reading Creb1.bw
## Reading Creb3.bw
## Reading Creb3L1.bw
plotEnrichedHeatmaps(ese, use_raster=FALSE)
The graph illustrates that k-values of 3, 4, and 5 are potential choices for clustering. Based on our discussions in class, it is often advantageous to start with a higher number of clusters (k=5) and then consolidate them if they appear redundant.
cl2 <- clusterSignalMatrices(ese, k=2:10)
ggplot(cl2$varExplained, aes(k, varExplained)) + geom_line()
We initially segmented the data into five clusters, which explained 74% of the variance. A comparison with four clusters, which accounted for 70% of the variance, shows minimal improvement in variance explanation when increasing to five clusters. This marginal gain suggests that clustering into four groups might be more appropriate. Additionally, clusters 2 and 3 in the five-cluster model contain notably fewer data points, which could indicate overfitting. This observation warrants further analysis to confirm the optimal cluster count.
set.seed(123) # to ensure that it gives the same results everytime
cl <- clusterSignalMatrices(ese, k=5)
## ~74% of the variance explained by clusters
table(cl)
## cl
## 1 2 3 4 5
## 828 134 167 688 452
head(cl)
## [1] 5 4 1 1 4 4
## Levels: 1 2 3 4 5
length(cl)
## [1] 2269
length(regions)
## [1] 2269
rowData(ese)$cluster <- cl
Upon visualizing the data, Clusters 2, 4 and 5 appear remarkably similar, underscoring our previous observations about diminishing returns when adding more clusters. In line with the strategy discussed in class to merge similar clusters for a clearer and more meaningful analysis, I will proceed by reducing the number of clusters to four.
mycolors <- c("1"="red", "2"="blue", "3"="darkgreen", "4"="black", "5" = "yellow")
plotEnrichedHeatmaps(ese, row_split="cluster",trim=0.99, mean_color=mycolors, colors=c("white","darkred"),use_raster=FALSE)
The heatmap distinctly showcases four clusters, validating the decision to reduce the cluster count. Observations from the heatmap are as follows:
Cluster 1: Demonstrates the highest activity for Creb3L1, which is significantly supported by substantial signals from both Creb1 and Creb3. This diverse range of active transcription factors suggests a versatile regulatory region with multifunctional capabilities.
Cluster 2: Predominantly characterized by a strong signal for Creb1, with moderate expression of Creb3L1 and only minimal activity from Creb3. This pattern suggests that Cluster 2 has a specific functional emphasis, primarily driven by Creb1.
Cluster 3: Shows a balanced, high activity for both Creb1 and Creb3L1, but with very little presence of Creb3. This configuration indicates a unique regulatory interaction that differs from the other clusters, possibly involving synergistic effects between Creb1 and Creb3L1.
Cluster 4: Marked by a notably high signal for Creb3, setting it apart from the other clusters. This distinct profile underscores Creb3’s dominant role within this cluster, potentially indicating specialized regulatory functions exclusive to Creb3.
cl <- clusterSignalMatrices(ese, k=4)
## ~68% of the variance explained by clusters
rowData(ese)$cluster <- cl
table(cl)
## cl
## 1 2 3 4
## 247 708 896 418
head(cl)
## [1] 3 3 2 2 2 3
## Levels: 1 2 3 4
mycolors <- c("1"="red", "2"="blue", "3"="darkgreen", "4"="black")
plotEnrichedHeatmaps(ese, row_split="cluster", mean_color=mycolors, colors=c("white","darkred"),use_raster=FALSE)
Cluster 1: Displays a balanced level of activity between Creb3L1 and Creb1 and Creb3, indicating their co-regulation in this genomic region.
Cluster 2: Shows a distinct, strong signal for Creb1, highlighting its dominant role and specialized activity within this cluster.
Cluster 3: Initially appears similar to Cluster 1 but upon closer inspection, it shows a notably lower peak for Creb3 while maintaining more pronounced signals for Creb1 and Creb3L1. This suggests a unique and complex regulatory pattern, differentiating it from Cluster 1.
Cluster 4: Characterized by a high peak for Creb3, emphasizing its significant influence and specialized function in this cluster.
d <- meltSignals(ese, splitBy=cl)
rowData(ese)$cluster <- cl
ggplot(d, aes(position, mean, colour=sample)) + geom_line(size=1.2) + facet_wrap(~split)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Cluster 1: Displays high peaks for Creb3L1 accompanied by moderate signals for Creb1, suggesting both play significant roles in this cluster. Notably, Creb3, which was more visible in the unscaled data, is barely detectable here.
Cluster 2: Dominated by a strong signal for Creb3, with minimal activity from other transcription factors. This is a shift from the previous heatmap where Creb1 was predominant. The change from Creb1 to Creb3 dominance in the scaled heatmap suggests that although Creb3 might be weaker in absolute terms, it exhibits relatively stronger or more consistent activity within this cluster when scaled.
Cluster 3: Characterized by very high peaks for Creb1, underscoring its dominant influence in this cluster—a stark contrast to the other transcription factors. The scaling emphasizes Creb1’s impact, overshadowing Creb3L1 and rendering Creb3 nearly invisible, suggesting that Creb1’s role is considerably amplified under relative scaling.
Cluster 4: Shows prominent activity for Creb3L1, indicating its significant regulatory role, in contrast to its previous dominance by Creb3. This shift suggests that Creb3, while showing spikes of high activity in absolute terms that skew its perceived importance, exhibits less consistent activity than Creb3L1 when signals are scaled relatively.
These discrepancies are explainable due to the use of different methodologies in signal scaling and normalization. Absolute scaling, which presents raw signal intensities, can mask subtle interplays between transcription factors. In contrast, relative scaling highlights differences in expression patterns across clusters by normalizing these signals, revealing unique regulatory dynamics that might not be apparent with absolute values alone.
cl <- clusterSignalMatrices(ese, k=4, scaleRows = TRUE)
## ~86% of the variance explained by clusters
d <- meltSignals(ese, splitBy=cl)
ggplot(d, aes(position, mean, colour=sample)) + geom_line() + facet_wrap(~split)
mycolors <- c("1"="red", "2"="blue", "3"="darkgreen", "4"="black")
plotEnrichedHeatmaps(ese, row_split = cl,mean_color=mycolors, scale_rows = "global",use_raster=FALSE)
Looking at the values, we can see that the enrichment analysis highlights a significant involvement of genomic regions in processes related to cell communication and response to stimuli, including “cell communication,” “cellular response to stimulus,” “signaling,” “response to stimulus,” “signal transduction,” and “response to chemical.” These processes show notable enrichment with observed region hits ranging from 180 to 311, indicating a strong representation within the analyzed genomic regions. Specifically, “cell communication” and “response to stimulus” are among the most enriched processes, with 270 and 311 hits respectively, suggesting these areas as key functions regulated by the genomic regions in this cluster. Fold enrichment values range from about 1.18 to 1.32, underscoring a moderate overrepresentation of these terms relative to the background. The hyper-p-values are highly significant, ranging from approximately 7.24e-07 to 0.0098, reinforcing the statistical significance of the enrichment.
Cluster 4 shows prominent activity for Creb3L1, indicating its significant regulatory role, in contrast to its previous dominance by Creb3. This shift suggests that Creb3, while showing spikes of high activity in absolute terms, exhibits less consistent activity than Creb3L1 when signals are scaled relatively. The enriched processes related to cell communication and response to stimuli align well with Creb3L1’s regulatory role, suggesting that Creb3L1 is crucial in coordinating cellular interactions and responses to environmental changes. This consistent activity of Creb3L1 likely drives the regulatory mechanisms within these enriched processes, highlighting its importance in maintaining cellular homeostasis and responding to external signals.
Creb3 may have shown higher activity in absolute terms due to its role in rapid and dynamic responses to stress and other stimuli, leading to spikes in activity. This characteristic makes Creb3 highly responsive under specific conditions but less consistent overall compared to Creb3L1.
split_regions <- split(rowRanges(ese), rowData(ese)$cluster)
res <- great(split_regions[["4"]], gene_sets="GO:BP", tss_source="hg38", background=regions, cores=2)
## * TSS source: TxDb.
## * check whether TxDb package 'TxDb.Hsapiens.UCSC.hg38.knownGene' is installed.
## * gene ID type in the extended TSS is 'Entrez Gene ID'.
## * restrict chromosomes to 'chr1, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chr10,
## chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr20, chr21, chr22, chrX,
## chrY, chrM'.
## * 18100/30601 protein-coding genes left.
## * update seqinfo to the selected chromosomes.
## * TSS extension mode is 'basalPlusExt'.
## * construct the basal domains by extending 5000bp to upstream and 1000bp to downsteram of TSS.
## * calculate distances to neighbour regions.
## * extend to both sides until reaching the neighbour genes or to the maximal extension.
## * use GO:BP ontology with 15517 gene sets (source: org.Hs.eg.db).
## * check gene ID type in `gene_sets` and in `extended_tss`.
## * use self-defined background regions.
## * remove excluded regions from background.
## * overlap `gr` to background regions (based on midpoint).
## * in total 418 `gr`.
## * overlap extended TSS to background regions.
## * check which genes are in the gene sets.
## * only take gene sets with size >= 5.
## * in total 2523 gene sets.
## * overlap `gr` to every extended TSS.
## * perform binomial test for each biological term.
bp <- getEnrichmentTables(res)
head(bp)
## id description genome_fraction observed_region_hits
## 1 GO:0007154 cell communication 0.5064778 270
## 2 GO:0051716 cellular response to stimulus 0.5683146 294
## 3 GO:0023052 signaling 0.4904877 258
## 4 GO:0050896 response to stimulus 0.6312118 311
## 5 GO:0007165 signal transduction 0.4596951 242
## 6 GO:0042221 response to chemical 0.3266318 180
## fold_enrichment p_value p_adjust mean_tss_dist observed_gene_hits
## 1 1.275343 6.109277e-09 8.276331e-06 122766 209
## 2 1.237605 9.045170e-09 8.276331e-06 125515 230
## 3 1.258390 1.283293e-07 7.828087e-05 124348 206
## 4 1.178715 6.065350e-07 2.434018e-04 121174 262
## 5 1.259416 6.650322e-07 2.434018e-04 127871 186
## 6 1.318371 5.710807e-06 1.741796e-03 102413 115
## gene_set_size fold_enrichment_hyper p_value_hyper p_adjust_hyper
## 1 474 1.247043 7.239169e-07 0.0008319708
## 2 547 1.189197 1.744555e-05 0.0079813370
## 3 467 1.247566 9.092577e-07 0.0008319708
## 4 642 1.154196 5.008222e-05 0.0130929231
## 5 430 1.223370 2.682245e-05 0.0098170169
## 6 282 1.153352 2.002177e-02 0.1936965004
The plot reveals significant enrichment of processes related to cell communication and response to stimuli, as evidenced by their high fold enrichment and low adjusted p-values. Processes such as “multicellular organismal process,” “regulation of cellular process,” and “regulation of biological process” involve a large number of genomic regions but are less statistically significant. This indicates that while these processes are common in the dataset, their enrichment is not as specific or strong compared to processes like “cell communication” and “cellular response to stimulus,” which are crucial for coordinating cellular interactions and responses to environmental changes.
ggplot(head(bp, 15), aes(fold_enrichment, reorder(description, p_adjust), size=observed_region_hits, color=-log10(p_adjust))) +
geom_point() + scale_color_viridis_c()